# Who Do You Trust? - Daniel Goldstein and Johannes Wiedemann
# R-script for Data Preparation (except for MRP part of the paper)


#====================================================================================================
# load packages

library(tidyverse)
library(readxl)
library(lfe)
library(interactions)
library(estimatr)
library(openxlsx)
library(texreg)
library(dotwhisker)
library(stargazer)
library(cobalt)
library(stringi)



#=======================================================================

# Change the working directory as appropriate
setwd("~/../GoldsteinWiedemann_supplementary/Data and Data Preparation")



#=======================================================================

# read in data




# First, read in county-covariates (most drawn from the 2010 census) from the 
# Atlas of Rural and Small Town America (https://www.ers.usda.gov/data-products/atlas-of-rural-and-small-town-america/download-the-data/)
# provided by the US Department of Agriculture's Economic Research Service

ruralAtlas1 <- read.xlsx("Raw Data/Copy of RuralAtlasData20.xlsx",sheet = 6)
ruralAtlas1 <- rename(ruralAtlas1,State_abb=State)
ruralAtlas1 <- ruralAtlas1 %>% mutate(County_Name=paste0(County," County"),FIPS_num=as.numeric(FIPS))


ruralAtlas2 <- read.xlsx("Raw Data/Copy of RuralAtlasData20.xlsx",sheet = 3)
ruralAtlas2 <- rename(ruralAtlas2,State_abb=State)
ruralAtlas2 <- ruralAtlas2 %>% mutate(County_Name=paste0(County," County"),FIPS_num=as.numeric(FIPS))

ruralAtlas3 <- read.xlsx("Raw Data/Copy of RuralAtlasData20.xlsx",sheet = 5)
ruralAtlas3 <- rename(ruralAtlas3,State_abb=State)
ruralAtlas3 <- ruralAtlas3 %>% mutate(County_Name=paste0(County," County"),FIPS_num=as.numeric(FIPStxt))

ruralAtlas4 <- read.xlsx("Raw Data/Copy of RuralAtlasData20.xlsx",sheet = 4)
ruralAtlas4 <- rename(ruralAtlas4,State_abb=State)
ruralAtlas4 <- ruralAtlas4 %>% mutate(County_Name=paste0(County," County"),FIPS_num=as.numeric(FIPS))



# read in social capital data (taken from JEC Social Capital Project: https://www.jec.senate.gov/public/index.cfm/republicans/socialcapitalproject)

socCapital_alt_1 <- read.xlsx("Raw Data/Copy of social-capital-project-social-capital-index-data.xlsx",sheet = 3)
colnames(socCapital_alt_1) <- socCapital_alt_1[1,]
socCapital_alt_1 <- socCapital_alt_1[-1,]
socCapital_alt_1 <- socCapital_alt_1[,c(5,6,8,13)]
socCapital_alt_1 <- socCapital_alt_1 %>% mutate(sk2017_alt=as.numeric(`County-Level Index`),InstHealth=as.numeric(`Institutional Health`),County=paste0(County," County"),State_abbr=`State Abbreviation`) %>% select(County,State_abbr,sk2017_alt,InstHealth)

socCapital_alt_2 <- read.xlsx("Raw Data/Copy of social-capital-project-social-capital-index-data.xlsx",sheet = 5)
colnames(socCapital_alt_2) <- socCapital_alt_2[1,]
socCapital_alt_2 <- socCapital_alt_2[-1,]
socCapital_alt_2 <- socCapital_alt_2[,c(5,6,12,13,14,16,17)] # Confidence in Instituionts Subindex and Informal Civic Engagement Subindex are on State Level!
socCapital_alt_2 <- socCapital_alt_2 %>% mutate(County_Name=paste0(County," County"),CivicEngagement=as.numeric(`Informal Civic Engagement Subindex`),ConfidenceInstitutions=as.numeric(`Confidence in Institutions Subindex`),CrimeRate=as.numeric(`Violent Crimes p 100,000`),Turnout12and16=as.numeric(`Presidential election voting rate, 2012 & 2016`)/100,ReligCongreg=as.numeric(`Religious congregations p 1,000`))
socCapital_alt_2 <- rename(socCapital_alt_2,State_abb=`State Abbreviation`)
socCapital_alt_2 <- socCapital_alt_2[,-c(2:7)]

socCapital_alt_3 <- read.xlsx("Raw Data/Copy of social-capital-project-social-capital-index-data.xlsx",sheet = 7)
socCapital_alt_3 <- socCapital_alt_3[,-c(1:4,7)] 
socCapital_alt_3 <- socCapital_alt_3 %>% mutate(County_Name=paste0(County," County"))
socCapital_alt_3 <- rename(socCapital_alt_3,State_abb=State.Abbreviation)
socCapital_alt_3 <- socCapital_alt_3[,-2]




# handcoded governor partisanship by state, based on National Governors Association (https://www.nga.org/governors/)

governor_2020_Rep <- cbind.data.frame(as.vector(setdiff(unique(socCapital_alt_3$State_abb),"DC")),c(1,1,1,1,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,0,1,0,0,0,0,1,1,1,0,0,0,1,1,1,1,1,1,0,0,1,0,1))
governor_2020_Rep <- rename(governor_2020_Rep,State_abb=`as.vector(setdiff(unique(socCapital_alt_3$State_abb), "DC"))`,Republican_gov_2020=`c(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, `)




# handcoded data on timing of stay-at-home orders, based on CNN
# see: https://www.cnn.com/2020/03/23/us/coronavirus-which-states-stay-at-home-order-trnd/index.html

stayhomeorders <- cbind.data.frame(as.vector(setdiff(unique(socCapital_alt_3$State_abb),"DC")),c("20/04/05","20/03/28","20/03/31",NA,"20/03/19","20/03/26","20/03/23","20/03/24","20/04/03","20/04/03","20/03/23","20/03/25","20/03/21","20/03/24",NA,"20/03/30","20/03/26","20/03/23","20/04/02","20/03/30","20/03/24","20/03/24","20/03/27","20/04/03","20/04/06","20/03/28",NA,"20/04/01","20/03/27","20/03/21","20/03/24","20/03/22","20/03/30",NA,"20/03/23","20/04/01","20/03/23","20/04/01","20/03/28","20/04/07",NA,"20/04/02","20/04/02",NA,"20/03/30","20/03/25","20/03/23","20/03/24","20/03/25",NA))
stayhomeorders <- stayhomeorders %>% mutate(`as.vector(setdiff(unique(socCapital_alt_3$State_abb),"DC"))`=as.character(`as.vector(setdiff(unique(socCapital_alt_3$State_abb), "DC"))`))
stayhomeorders <- stayhomeorders[,-1]
stayhomeorders <- rbind.data.frame(stayhomeorders,c("20/04/01","DC"))
stayhomeorders <- rename(stayhomeorders,State_abb=`as.vector(setdiff(unique(socCapital_alt_3$State_abb),"DC"))`,StayOrder_Date=`c("20/04/05", "20/03/28", "20/03/31", NA, "20/03/19", "20/03/26", `)
stayhomeorders <- stayhomeorders %>% mutate(Stayorder_Date_date=as.Date(StayOrder_Date,"%y/%m/%d"))




# merge the datasets by County and State

data1 <- left_join(ruralAtlas1,ruralAtlas2,by=c("State_abb","FIPS_num"))
data1 <- left_join(data1,ruralAtlas3,by=c("State_abb","FIPS_num"))
data1 <- left_join(data1,ruralAtlas4,by=c("State_abb","FIPS_num"))


data1 <- data1 %>% select(-c(County.x.x,County.y,County.y.y,County_Name.x.x,County_Name.y,County_Name.y.y))
data1 <- rename(data1,County_Name="County_Name.x")
data1 <- rename(data1,County="County.x")


data1 <- left_join(data1,socCapital_alt_1,by=c("State_abb"="State_abbr","County_Name"="County"))
data1 <- left_join(data1,socCapital_alt_3,by=c("State_abb","County_Name"))
data1 <- left_join(data1,socCapital_alt_2,by=c("State_abb","County_Name"))




# read in County-level election results for 2008, 2012, 2016 (from MIT Election lab: https://electionlab.mit.edu/data)

elec_2016alt <- read.csv("Raw Data/countypres_2000-2016.csv")

# fix two data issues: 1) correct the FIPS code for Oglala Lakota County, SD; 2) drop Alaska observations, because election results not recorded on county level
elec_2016alt <- elec_2016alt %>% mutate(FIPS=ifelse(county=="Oglala Lakota",46102,FIPS))
elec_2016alt <- elec_2016alt %>% filter(state!="Alaska")



elec_2016alt <- elec_2016alt %>% filter(year%in% c(2008,2012,2016)) %>% filter(party %in% c("democrat","republican"))
elec_2016alt <- elec_2016alt[,-7]

elec_2016alt <- elec_2016alt %>% spread(party,candidatevotes)

elec_2016alt <- elec_2016alt[,-c(2,4,6,8)]

# Calculate percentage Trump won of total votes in county
elec_2016alt <- elec_2016alt %>% 
  mutate(
    gop_per =   republican / totalvotes,dem_per=democrat/totalvotes
  )

elec_2016alt2 <- elec_2016alt %>% mutate(gop_win=if_else(gop_per>dem_per,1,0))

elec_2016alt <- elec_2016alt[,-c(2,4:6,8)]
elec_2016alt <- elec_2016alt %>% filter(is.na(FIPS)==FALSE)

elec_2016alt <- elec_2016alt %>% spread(year,gop_per)
elec_2016alt <- rename(elec_2016alt,fips_code=FIPS,gop_per_2008=`2008`,gop_per_2012=`2012`,gop_per_2016=`2016`)




elec_2016alt2 <- elec_2016alt2[,-c(2,4:8)]
elec_2016alt2 <- elec_2016alt2 %>% filter(is.na(FIPS)==FALSE)

elec_2016alt2 <- elec_2016alt2 %>% spread(year,gop_win)
elec_2016alt2 <- rename(elec_2016alt2,fips_code=FIPS,gop_win_2008=`2008`,gop_win_2012=`2012`,gop_win_2016=`2016`)

elec_2016alt <- left_join(elec_2016alt,elec_2016alt2)




# merge election data and governor partisanship data with previous data

data1 <- left_join(data1,elec_2016alt,by=c("FIPS_num"="fips_code"))

data1 <- left_join(data1,governor_2020_Rep,by="State_abb")




# read in vaccination rates by county data (from CMS Mapping Medicare Disparities Tool: https://data.cms.gov/mapping-medicare-disparities)

vaccination1 <- read.csv("Raw Data/mmd_data.csv")


# merge vaccination rates data with previous data

data1 <- left_join(data1,vaccination1,by=c("FIPS_num"="fips"))




# The dataset compiled up to this point only draws on publicly available data.
# Researchers interested in replicating our final dataset can merge this dataset with the Cuebiq data, once they obtained access (see read.me).

save(data1,file="GW_2020_MainDataSet_withoutCuebiq_122220.RData") 

#load(file="GW_2020_MainDataSet_withoutCuebiq_122220.RData") 



#===================================================================================================

# read in Cuebiq data (see read.me on how to obtain access)
# notice: we used the data version from April 29, 2020



cuebiq_april_10 <- read.csv("Raw Data/cmi-20200429.csv")

cuebiq_19 <- cuebiq_april_10 %>% filter(as.Date(ref_dt)<"2020-01-01")


# this daily data from 2019 will be necessary to perform a sanity check on whether the data reflect mobility
cuebiq_2019_daily <- cuebiq_19

# we limited our sample from January 1st, 2020 to April, 20, 2020
cuebiq_april_10 <- cuebiq_april_10 %>% filter(as.Date(ref_dt)>="2020-01-01") %>% filter(as.Date(ref_dt)<"2020-04-20")


# we merge mobility data from 2019 on the county-week level
cuebiq_19 <- cuebiq_19 %>% group_by(county_fips_code,week_name) %>% arrange(ref_dt) %>%mutate(cmi_weekly_2019=mean(cmi,na.rm=TRUE),sheltered_in_place_2019=mean(sheltered_in_place,na.rm=TRUE),less_10_mile_2019=mean(less_10_mile,na.rm=TRUE),less_1_mile_2019=mean(less_1_mile,na.rm=TRUE))
cuebiq_19 <- cuebiq_19 %>% group_by(county_fips_code,week_name) %>% distinct(county_fips_code,week_name,.keep_all = TRUE) %>% ungroup() %>% select(county_fips_code,week_name,cmi_weekly_2019,sheltered_in_place_2019,less_10_mile_2019,less_1_mile_2019)


cuebiq_19 <- rename(cuebiq_19,week_name_2019=week_name)

cuebiq_19 <- cuebiq_19 %>% mutate(week_name_2020=paste0("2020",stri_sub(week_name_2019,5,8)))

cuebiq_april_10 <- left_join(cuebiq_april_10,cuebiq_19,by=c("week_name"="week_name_2020","county_fips_code"))




# merge the datasets on covariates (e.g. social capital, partisanship, county-level controls) and mobility data (provided by Cuebiq) by County and State
data_time_series_1 <-  left_join(cuebiq_april_10,data1,by=c("county_fips_code"="FIPS_num"))


data_time_series_1 <-  left_join(data_time_series_1,stayhomeorders,by="State_abb")


# code whether given date was pre or post implementation of stay-at-home order
data_time_series_1 <- data_time_series_1 %>% mutate(post_treatment=if_else(as.Date(ref_dt)>Stayorder_Date_date,1,0))
data_time_series_1 <- data_time_series_1 %>% mutate(post_treatment2=if_else(is.na(post_treatment)==TRUE,0,post_treatment))



# create cutoffs for binary versions of social capital, turnout, vaccination rates
median_sk2017_alt <- median(data_time_series_1$sk2017_alt,na.rm=TRUE)
data_time_series_1 <- data_time_series_1 %>% mutate(sk2017_alt_bin=if_else(sk2017_alt>median_sk2017_alt,1,0,missing =NULL ))


median_Turnout1216 <- median(data_time_series_1$Turnout12and16,na.rm=TRUE)
data_time_series_1 <- data_time_series_1 %>% mutate(Turnout12and16_bin=if_else(Turnout12and16>median_Turnout1216,1,0,missing =NULL ))


median_vaccination <- median(data_time_series_1$analysis_value,na.rm=TRUE)
data_time_series_1 <- data_time_series_1 %>% mutate(vaccination_bin=if_else(analysis_value>median_vaccination,1,0,missing =NULL ))




# create weekly panel data

# take mean of Cuebiq-provided variables by week and keep one observation per week
data_weekly_series_1 <- data_time_series_1 %>% group_by(FIPS.y,week_name) %>% arrange(ref_dt) %>%mutate(cmi_weekly=mean(cmi,na.rm=TRUE),shelter_2020=mean(sheltered_in_place,na.rm=TRUE),less_10_2020=mean(less_10_mile,na.rm=TRUE),less_1_2020=mean(less_1_mile,na.rm=TRUE))
data_weekly_series_1 <- data_weekly_series_1 %>% group_by(FIPS.y,week_name) %>% distinct(FIPS.y,week_name,.keep_all = TRUE) %>% ungroup()
data_weekly_series_1 <- data_weekly_series_1 %>% mutate(month=as.factor(stri_sub((as.Date(ref_dt)+4),6,7)))


# create treatment variable used throughout analysis: has order been implemented in first three days of a given week?
data_weekly_series_1 <- data_weekly_series_1 %>% mutate(post_treatment3=if_else((as.Date(ref_dt)-Stayorder_Date_date)>(-4),1,0,missing = 0))


# create variables that indicate treatment weeks on the county-week level
data_weekly_series_1 <- data_weekly_series_1 %>% mutate(week_num=as.numeric(stri_sub(week_name,7,8)))
dummy1 <- data_weekly_series_1 %>% group_by(FIPS.y,week_name) %>% arrange(week_name) %>% slice(which(post_treatment3==1)[1]) %>% ungroup() %>%distinct(FIPS.y,.keep_all = TRUE) 
dummy2 <- dummy1 %>% mutate(TreatmentWeek=1) %>% select(FIPS.y,week_name,TreatmentWeek)
dummy3 <- dummy1 %>% mutate(TreatmentWeek_num=week_num) %>% select(FIPS.y,TreatmentWeek_num)
data_weekly_series_1 <- left_join(data_weekly_series_1,dummy2,by=c("FIPS.y","week_name"))

data_weekly_series_1 <- data_weekly_series_1 %>% mutate(TreatmentWeek2=if_else(is.na(TreatmentWeek)==TRUE,0,TreatmentWeek))
data_weekly_series_1 <- left_join(data_weekly_series_1,dummy3,by=c("FIPS.y"))


# create variables that indicate the relative timely distance from treatment week, on the county-week level
data_weekly_series_1 <- data_weekly_series_1 %>% mutate(TreatmentWeek_rel=(week_num-TreatmentWeek_num))
data_weekly_series_1 <- data_weekly_series_1 %>% mutate(TreatmentWeek_num2=if_else(is.na(TreatmentWeek_num)==TRUE,17,TreatmentWeek_num)) # never treated counties have been assigned a weekly value after our study concludes
data_weekly_series_1 <- data_weekly_series_1 %>% mutate(TreatmentWeek_rel2=(week_num-TreatmentWeek_num2))

# creat variables for never-treated counties, as well as for counties in a) never treated states, b) CA, c) MI, and d) NC (for parallel trends illustrative exercise)
data_weekly_series_1 <- data_weekly_series_1 %>% ungroup() %>%mutate(NeverTreated=if_else(is.na(TreatmentWeek_num)==TRUE,1,0))
data_weekly_series_1 <- data_weekly_series_1 %>% ungroup() %>%mutate(NeverTreated_CA_MI_NC=if_else(State_abb=="CA","CA",if_else(State_abb=="MI","MI",if_else(State_abb=="NC","NC",if_else(NeverTreated==1,"Never","Rest")))))

# vaccination rates and county-level controls as percentages, where appropriate (to make them consistent with how other variables were coded)
data_weekly_series_1 <- data_weekly_series_1 %>% mutate(analysis_value=analysis_value/100)
data_weekly_series_1 <- data_weekly_series_1 %>% mutate(UnempRate2018=UnempRate2018/100,WhiteNonHispanicPct2010=WhiteNonHispanicPct2010/100,BlackNonHispanicPct2010=BlackNonHispanicPct2010/100,AsianNonHispanicPct2010=AsianNonHispanicPct2010/100,Age65AndOlderPct2010=Age65AndOlderPct2010/100,PctEmpAgriculture=PctEmpAgriculture/100,PctEmpManufacturing=PctEmpManufacturing/100,PctEmpServices=PctEmpServices/100,Ed5CollegePlusPct=Ed5CollegePlusPct/100)



save(data_weekly_series_1,file="GW_2020_MainDataSet_122220.RData") 
save(cuebiq_2019_daily,file="GW_2020_DataSetDaily2019_122220.RData") 

# Data preparation ends here
#=====================================================================================================
#=====================================================================================================